Load the necessary libraries
library(gbm) #for gradient boosted models
library(car)
library(dismo)
library(pdp)
library(ggfortify)
library(randomForest)
library(tidyverse)
library(patchwork)
library(stars)
Leathwick et al. (2008) compiled capture data of short-finned eels (_Anguilla australis) within New Zealand freshwater streams to explore the distribution of the eels for conservation purposes. The goal was to be able to model the presence/absence of the eels against a range of environmental characteristics so as to predict their more broader occurances and identify which predictors are the most important in the predictions.
eel
Format of leathwick.csv data file
| Site | Angaus | SegSumT | SegTSeas | SegLowFlow | … |
|---|---|---|---|---|---|
| 1 | 0 | 16.0 | -0.10 | 1.036 | … |
| 2 | 1 | 18.7 | 1.51 | 1.003 | … |
| 3 | 0 | 18.3 | 0.37 | 1.001 | … |
| 4 | 0 | 16.7 | -3.80 | 1.000 | … |
| 5 | 1 | 17.2 | 0.33 | 1.005 | … |
| 6 | 0 | 15.1 | 1.83 | 1.015 | … |
| .. | .. | .. | .. | .. | … |
| Site | Unique label for each site. |
| Angaus | Presence (1) or absence (0) of Anguilla australis eels |
| SegSumT | Summer air temperature (degrees celcius) at the river segment |
| scale | |
| SegTSeas | Winter temperature normalised to January temperature at the river |
| segment scale | |
| SegLowFlow | Forth root transformed low flow rate at the river segment scale |
| DSDist | Distance to coast (km) (a downstream predictor) |
| DSDam | Presence of known downsteam obstructions (a downstream predictor) |
| DSMaxSlope | Maximum downstream slope (a downstream predictor) |
| USAvgT | Upstream average tempeture (normalised for the river segment) |
| USRainDays | Number of rainy days recorded in the upstream catchment |
| USSlope | Slope of the river upstream |
| USNative | Percentage of the upstream riparian vegetation that is native |
| Method | Method used to capture the eels (categorical predictor) |
| LocSed | Weighted average of the proportional cover of bed sediment |
| (1=mud, 2=sand, 3=fine gravel, 4=course gravel, 5=cobble, 6=boulder, 7=bedrock) |
leathwick = read_csv('../public/data/leathwick.csv', trim_ws=TRUE)
glimpse(leathwick)
## Rows: 1,000
## Columns: 14
## $ Site <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ Angaus <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0…
## $ SegSumT <dbl> 16.0, 18.7, 18.3, 16.7, 17.2, 15.1, 12.7, 18.1, 18.9, 18.2…
## $ SegTSeas <dbl> -0.10, 1.51, 0.37, -3.80, 0.33, 1.83, 2.17, 1.00, 1.59, 0.…
## $ SegLowFlow <dbl> 1.036, 1.003, 1.001, 1.000, 1.005, 1.015, 1.001, 1.002, 1.…
## $ DSDist <dbl> 50.2000, 132.5300, 107.4400, 166.8200, 3.9500, 11.1700, 42…
## $ DSMaxSlope <dbl> 0.57, 1.15, 0.57, 1.72, 1.15, 1.72, 2.86, 2.29, 0.40, 3.43…
## $ USAvgT <dbl> 0.09, 0.20, 0.49, 0.90, -1.20, -0.20, 1.45, 0.47, 0.25, 0.…
## $ USRainDays <dbl> 2.470, 1.153, 0.847, 0.210, 1.980, 3.300, 0.430, 1.153, 0.…
## $ USSlope <dbl> 9.8, 8.3, 0.4, 0.4, 21.9, 25.7, 9.6, 4.9, 9.8, 20.5, 3.9, …
## $ USNative <dbl> 0.81, 0.34, 0.00, 0.22, 0.96, 1.00, 0.09, 0.02, 0.74, 0.92…
## $ DSDam <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0…
## $ Method <chr> "electric", "electric", "spo", "electric", "electric", "el…
## $ LocSed <dbl> 4.8, 2.0, 1.0, 4.0, 4.7, 4.5, 4.3, NA, NA, 3.6, 3.7, 1.0, …
leathwick = leathwick %>% mutate(Method=factor(Method),
LocSed=factor(LocSed)) %>%
as.data.frame
leathwick_test = read_csv('../public/data/leathwick_test.csv', trim_ws=TRUE)
glimpse(leathwick_test)
## Rows: 500
## Columns: 13
## $ Angaus_obs <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ SegSumT <dbl> 16.6, 16.8, 16.3, 15.6, 14.6, 16.7, 18.3, 16.4, 17.2, 15.7…
## $ SegTSeas <dbl> 1.01, -0.51, 0.76, 1.56, -0.20, 0.80, 0.67, 1.44, -0.67, -…
## $ SegLowFlow <dbl> 1.017, 1.002, 1.023, 1.003, 1.023, 1.003, 1.005, 1.031, 1.…
## $ DSDist <dbl> 5.2300, 2.2400, 162.2800, 4.0500, 127.0300, 2.4800, 94.077…
## $ DSMaxSlope <dbl> 0.29, 0.00, 5.14, 0.57, 1.72, 14.57, 0.57, 1.72, 1.72, 2.8…
## $ USAvgT <dbl> -1.40, 0.27, -0.60, 1.14, -1.90, -1.50, 0.54, 0.75, -1.80,…
## $ USRainDays <dbl> 1.980, 0.460, 0.806, 3.300, 1.940, 1.980, 0.847, 2.249, 0.…
## $ USSlope <dbl> 10.0, 0.7, 21.4, 0.9, 28.9, 22.0, 1.0, 4.8, 26.4, 26.1, 23…
## $ USNative <dbl> 1.00, 0.00, 0.66, 0.75, 0.97, 0.99, 0.01, 0.02, 0.74, 0.99…
## $ DSDam <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ Method <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ LocSed <dbl> 4.9, 2.3, 4.3, 1.0, NA, 2.8, NA, NA, 4.2, 4.1, 4.3, 1.2, N…
leathwick_test = leathwick_test %>% mutate(Method=factor(Method),
LocSed=factor(LocSed)) %>%
as.data.frame
scatterplotMatrix(~Angaus+SegSumT+SegTSeas+SegLowFlow+DSDist+DSMaxSlope+DSDam+
USAvgT+USRainDays+USSlope+USNative+Method+LocSed, data=leathwick, diagonal=list(method='boxplot'))
In this case, we already have the training and tests sets - there is no need to partition out the data. Nevertheless, it is still worth setting the random seed to ensure repeatibility.
set.seed(123)
leathwick.gbm = gbm(Angaus ~ SegSumT+SegTSeas+SegLowFlow+DSDist+DSMaxSlope+DSDam+
USAvgT+USRainDays+USSlope+USNative+Method+LocSed,
data=leathwick,
distribution='bernoulli',
var.monotone=c(1,1,1,-1,-1,0,1,-1,-1,-1,0,0),
n.trees=10000,
interaction.depth=5,
bag.fraction=0.5,
shrinkage=0.001,
train.fraction=1,
cv.folds=3)
(best.iter = gbm.perf(leathwick.gbm,method='OOB'))
## [1] 1452
## attr(,"smoother")
## Call:
## loess(formula = object$oobag.improve ~ x, enp.target = min(max(4,
## length(x)/10), 50))
##
## Number of Observations: 10000
## Equivalent Number of Parameters: 39.99
## Residual Standard Error: 1.107e-05
(best.iter = gbm.perf(leathwick.gbm,method='cv'))
## [1] 2146
summary(leathwick.gbm, n.trees=best.iter)
nms <- colnames(leathwick)
p <- vector('list', 12)
names(p) <- nms[3:14]
for (nm in nms[3:14]) {
print(nm)
p[[nm]] <- leathwick.gbm %>% pdp::partial(pred.var=nm,
n.trees=best.iter,
inv.link=plogis,
recursive=FALSE,
type='regression') %>%
autoplot() + ylim(0, 0.5)
}
## [1] "SegSumT"
## [1] "SegTSeas"
## [1] "SegLowFlow"
## [1] "DSDist"
## [1] "DSMaxSlope"
## [1] "USAvgT"
## [1] "USRainDays"
## [1] "USSlope"
## [1] "USNative"
## [1] "DSDam"
## [1] "Method"
## [1] "LocSed"
do.call('grid.arrange', p)
ROC curve: receiver operating characteristic curve performance of a classication model at all thresholds. It plots two components: y-axis: True Positive rate (TP/(TP+FN)) x-axis: False Positive rate (FP/(FP+TN)) AUC: Area under ROC curve an aggregate measure of the performance under all thresolds ranges from 0 (0% correct) to 1 (100% correct). max TPR+TNR at: the threshold. Values above this should be considered 1, otherwise 0 (when classifying)
leathwick_test %>%
bind_cols(Pred = predict(leathwick.gbm,newdata=leathwick_test,
n.tree=best.iter, type='response')) %>%
ggplot() +
geom_boxplot(aes(y=Pred, x=as.factor(Angaus_obs))) +
geom_point(aes(y=Pred, x=as.factor(Angaus_obs)), position=position_jitter(width=0.05))
preds <- predict.gbm(leathwick.gbm, newdata=leathwick_test,
n.trees=best.iter, type='response')
preds <- leathwick_test %>%
bind_cols(Pred = predict(leathwick.gbm,newdata=leathwick_test,
n.tree=best.iter, type='response'))
pres <- preds %>% filter(Angaus_obs==1) %>% pull(Pred)
abs <- preds %>% filter(Angaus_obs==0) %>% pull(Pred)
e <- dismo::evaluate(p=pres, a=abs)
e
## class : ModelEvaluation
## n presences : 107
## n absences : 393
## AUC : 0.8109439
## cor : 0.4170651
## max TPR+TNR at : 0.1574458
data(Anguilla_grids)
leathwick.grid = Anguilla_grids
glimpse(leathwick.grid)
## Formal class 'RasterBrick' [package "raster"] with 12 slots
## ..@ file :Formal class '.RasterFile' [package "raster"] with 13 slots
## ..@ data :Formal class '.MultipleRasterData' [package "raster"] with 14 slots
## ..@ legend :Formal class '.RasterLegend' [package "raster"] with 5 slots
## ..@ title : chr(0)
## ..@ extent :Formal class 'Extent' [package "raster"] with 4 slots
## ..@ rotated : logi FALSE
## ..@ rotation:Formal class '.Rotation' [package "raster"] with 2 slots
## ..@ ncols : int 250
## ..@ nrows : int 196
## ..@ crs :Formal class 'CRS' [package "sp"] with 1 slot
## ..@ history : list()
## ..@ z : list()
plot(leathwick.grid)
Method <- factor('electric', levels=levels(leathwick$Method))
Method = as.data.frame(Method)
fit <- predict(leathwick.grid, leathwick.gbm, const=Method,
n.trees=best.iter, type='response')
#fit <- mask(fit, raster(leathwick.grid, 1))
fit= stars::st_as_stars(fit)
ggplot() +
geom_stars(data=fit) +
scale_fill_gradient(low='red', high='green', 'Probability\nof occurrance', na.value=NA) +
coord_sf(expand=FALSE) +
theme_bw()
Leathwick, J. R., J. Elith, W. L. Chadderton, D. Rowe, and T. Hastie. 2008. “Dispersal, Disturbance and the Contrasting Biogeographies of New Zealand’s Diadromous and Non-Diadromous Fish Species.” Journal of Biogeography 35 (8): 1481–97. https://doi.org/https://doi.org/10.1111/j.1365-2699.2008.01887.x.